# GSS_coding.R

# Part of the replication archive for 
#
#   Bullock, John G. 2020. "Education and Attitudes toward Redistribution in
#   the United States." British Journal of Political Science 50.


# This file loads and codes the cumulative GSS dataset. It also merges the 
# GSS dataset with data on compulsory attendance laws and with demographic and
# political data that are measured at the state-year level. 

library(Bullock, lib.loc = c(.libPaths(), 'packageLibrary'))   # for %IN%, merge_fac(), qw()
library(car)       # for Recode()
library(dplyr)     # for %>%, left_join(), mutate(), select()
library(forcats)   # for fct_relabel()
library(haven)     # for read_dta(), read_spss()
library(stringr)   # for str_pad()

# Name of the cumulative GSS file (Stata format) that is expected in data/.
GSS_cumulative_filename <- 'GSS7218_R1.DTA'

# If the file is not yet in data/, download the zipped archive to a temporary
# file and extract only the cumulative dataset from it.
if (!file.exists(paste0("data/", GSS_cumulative_filename))) {
  GSS_cumulative <- tempfile(fileext = '.zip')
  download.file('http://www.gss.norc.org/documents/stata/GSS_stata.zip', GSS_cumulative)
  unzip(zipfile = GSS_cumulative, files = GSS_cumulative_filename, exdir = 'data')
}
# Read the cumulative file, keep only the variables that are used below (in 
# this order), apply as_factor() to the result, and change all factor levels 
# to upper case.
GSS <- paste0("data/", GSS_cumulative_filename) %>%
  read_dta() %>%
  select(
    year, reg16, 
    educ, degree, paeduc, maeduc, padeg, madeg, 
    cohort, id, age, sex, race, born,
    eqwlth, goveqinc, eqincome, helppoor, natfare,
    marital, realrinc, povline, 
    prestige, papres16, papres80, 
    goodlife,
    starts_with('pay'), starts_with('giv'),
    getahead, welfare1, wordsum, 
    occ, occ80) %>%
  as_factor() %>%
  mutate_if(is.factor, fct_relabel, toupper)


# LOAD RESTRICTED DATA
# The state-of-interview ("fipsstat") and state-of-residence-at-age-16 
# ("fipsstat16" or "state16") variables are restricted. To get them, you must
# apply to the GSS. See 
# https://gss.norc.org/documents/other/ObtainingGSSSensitiveDataFiles.pdf.
# Main restricted file. The two state variables are stored as character so 
# that they can be combined with the character-valued 2012 data below.
GSS_restricted <- read_spss(
  file = 'dataRestricted/GSS7210_xsection_for release+STATE16 for release.sav') %>%
  as_factor() %>%
  mutate_at(qw("fipsstat fipsstat16"), as.character)

# 2012 supplements. Both CSV files are read with stringsAsFactors = FALSE so 
# that the state columns are character vectors regardless of the R version
# (the default for read.csv() was TRUE before R 4.0.0).
GSS_state16_2012      <- read.csv(
  file             = 'dataRestricted/State16-2012.csv', 
  stringsAsFactors = FALSE)  
GSS_stateContemp_2012 <- read.csv(
  file             = 'dataRestricted/stateOfInterview2012.csv',
  stringsAsFactors = FALSE) %>%
  rename(state = "STATE") %>%
  mutate(year = 2012)

# State-of-interview data; the object name indicates that it covers 1972-1977.
GSS_stateOfInterview_1972_1977 <- read_spss(
  file = 'dataRestricted/GSS_restricted_current-state_data.sav') %>%
  as_factor() %>%
  mutate(fipsstat = as.character(fipsstat))


# LOAD AUXILIARY FILES
# Both files are run for their side effects on the current session. The code 
# below uses the objects `CSLdata` and `mergeStateControlVars`, neither of 
# which is defined in this file; presumably they are created by these two 
# files (TODO confirm).
source('CSL_coding.R')  # coding of attendance laws  
source('functions/mergeStateControlVars.R')
                           


##############################################################################
# CODE THE STATE-OF-RESIDENCE-AT-16 VARIABLE
##############################################################################
# GSS$state16, a restricted variable, starts in 1978. (NORC may have lost
# the data from 1972-77.) I name the variable "stateYoung" for compatibility 
# with other files in the replication archive.
# Strip all attributes from the merge keys in both datasets so that they are 
# plain vectors when the datasets are joined.
for (key in qw("year id")) {
  attributes(GSS[[key]])            <- NULL
  attributes(GSS_restricted[[key]]) <- NULL
}

# Stack the state-at-16 data from the main restricted file and the 2012 file, 
# merge it into GSS by year and id, and rename it to "stateYoung".
state16 <- bind_rows(GSS_restricted[, qw("year id fipsstat16")], GSS_state16_2012)
GSS     <- GSS %>%
  left_join(state16, by = qw("year id")) %>%
  rename(stateYoung = "fipsstat16")

# Foreign, Alaska, Hawaii, and empty-string values are set to NA.
GSS$stateYoung <- Recode(
  var       = GSS$stateYoung, 
  recodes   = 'c("Foreign", "AK", "HI", "") = NA', 
  as.factor = TRUE)


# CHECK STATE OF RESIDENCE AT 16 AGAINST REGION AT AGE 16
# The state16 variable is restricted, but the reg16 variable is public-access.
# The command below shows that almost all people coded as living in a given 
# state are also coded as being in the same region. This is as it should be; 
# it suggests that the state16 data, which was first digitized and coded by 
# the GSS for this article, has been correctly coded.
table(GSS[["reg16"]], GSS[["stateYoung"]])  # rows: region at 16; columns: state at 16


  
##############################################################################
# VARIABLES NEEDED FOR EDUCATION/CSL ANALYSIS
##############################################################################
# As with the ANES, we use compulsory schooling laws in the state one lived in
# when a teenager as an instrument for the amount of education received.
#
# Unless otherwise noted, the "parents" variable applies to the status of both 
# parents and is TRUE only if the status is true of both parents.  For example,
# GSS$parents.noHS is TRUE only if both parents have an educational status of
# 'LT HIGH SCHOOL'.  Otherwise, it is FALSE or NA.  
# Respondent's own education. educOriginal keeps the raw "highest year of 
# school completed" variable; educ is the recoded version ("IAP", "DK", and
# "NA" set to missing, not returned as a factor). The remaining variables are 
# thresholds or truncations of educ, except for college, which is based on 
# the respondent's highest degree.
GSS <- GSS %>%
  mutate(
    educOriginal     = educ,
    educ             = Recode(educ, 'c("IAP", "DK", "NA")=NA', as.factor = FALSE),
    yearsTo13        = pmin(educ, 13),
    years.after.HS.1 = pmax(0, educ - 12),
    educ.after.HS    = educ >= 13,
    HSgrad           = educ >= 12,
    min9             = educ >= 9,
    college          = ordered(
      x      = degree,
      levels = c(
        'LT HIGH SCHOOL', 'HIGH SCHOOL', 'JUNIOR COLLEGE', 'BACHELOR',
        'GRADUATE')) >= 'BACHELOR',
    collegeAttended  = educ >= 13)
# Parents' education. degree.fa and degree.mo are ordered versions of the 
# father's and mother's highest degree; every "parents.*" variable is a 
# comparison of those two ordered factors.
parentDegreeLevels <- c(
  'LT HIGH SCHOOL', 'HIGH SCHOOL', 'JUNIOR COLLEGE', 'BACHELOR', 'GRADUATE')
GSS <- GSS %>%
  mutate(
    degree.fa = ordered(padeg, levels = parentDegreeLevels),
    degree.mo = ordered(madeg, levels = parentDegreeLevels),

    # Conditions that must hold for both parents
    parents.noHS      = degree.fa <  'HIGH SCHOOL' & degree.mo <  'HIGH SCHOOL',
    parents.noBA      = degree.fa <  'BACHELOR'    & degree.mo <  'BACHELOR',
    parents.HSnoBA    = 
      degree.fa %IN% c('HIGH SCHOOL', 'JUNIOR COLLEGE') &
      degree.mo %IN% c('HIGH SCHOOL', 'JUNIOR COLLEGE'),
    parents.HS_orMore = degree.fa >= 'HIGH SCHOOL' & degree.mo >= 'HIGH SCHOOL',
    parents.BA_orMore = degree.fa >= 'BACHELOR'    & degree.mo >= 'BACHELOR',

    # Conditions that must hold for at least one parent
    parents.minOneHS  = degree.fa >= 'HIGH SCHOOL' | degree.mo >= 'HIGH SCHOOL',
    parents.minOneBA  = degree.fa >= 'BACHELOR'    | degree.mo >= 'BACHELOR',

    # Combinations of the variables above
    parents.minOneHS_noBA  = parents.minOneHS  & parents.noBA,
    parents.onlyOneHS      = parents.minOneHS  & !parents.HS_orMore,
    parents.onlyOneBA      = parents.minOneBA  & !parents.BA_orMore,
    parents.onlyOneHS_noBA = parents.onlyOneHS & !parents.minOneBA)

# Sanity checks on the parental-education variables. The tables are printed 
# explicitly: inside a braced block only the value of the last expression can 
# be auto-printed, and nothing is auto-printed when this file is source()d.
if (interactive()) {
  # Check to ensure that the "TRUE, TRUE" cell has zero entries
  print(with(GSS, table(parents.noHS, parents.HSnoBA)))
  
  # Check to ensure that the "FALSE, TRUE" cell has zero entries
  print(with(GSS, table(parents.minOneBA, parents.BA_orMore)))
}


# YEAR OF INTERVIEW, YEAR OF BIRTH, AND RELATED YEARS
# Year of interview. In the factor version, 1994 is recoded to 1996 and 
# unused levels are dropped.
GSS$yearInt     <- as.integer(GSS$year)
GSS$yearInt.fac <- GSS$yearInt %>%
  Recode('1994 = 1996') %>%
  ordered() %>%
  droplevels()

# Year of birth: "IAP", "NA", and 9999 are set to missing before the 
# conversion (via character) to integer.
GSS$YOB <- as.integer(as.character(Recode(GSS$cohort, 'c("IAP", "NA", 9999)=NA')))

# Year in which the respondent turned 14, under two names. The factor version 
# again recodes 1994 to 1996.
GSS$year14        <- GSS$YOB + 14
GSS$yearYoung     <- GSS$year14
GSS$yearYoung.fac <- GSS$yearYoung %>%
  Recode('1994 = 1996') %>%
  ordered()


# CODE STATE AT TIME OF INTERVIEW
# The 2012 file stores state abbreviations, while the other files store full 
# state names in "fipsstat". Translate the abbreviations to full names so that 
# all three sources can be stacked. "DC" is not in state.abb, so it is filled 
# in separately. It must be written to "fipsstat" (the only state column that 
# is kept below); otherwise 2012 respondents in DC would be coded as missing.
GSS_stateContemp_2012$fipsstat <- setNames(state.name, state.abb)[as.character(GSS_stateContemp_2012$state)]
GSS_stateContemp_2012$fipsstat[GSS_stateContemp_2012$state %in% 'DC'] <- 'District of Columbia'   

# Stack the three sources of state-of-interview data and merge them into GSS.
tmp <- bind_rows(
    GSS_stateOfInterview_1972_1977,
    GSS_restricted,
    GSS_stateContemp_2012) %>%
  select(year, id, fipsstat)
GSS <- left_join(GSS, tmp, by = qw("year id")) %>%
  mutate(state.contemp = Recode(.$fipsstat, 'c("0", "Alaska", "Hawaii")=NA; "District of Columbia"="DC"')
)

# Convert full state names to postal abbreviations with a single recode 
# specification of the form '"Alabama"="AL"; "Alaska"="AK"; ...'.
recodeString      <- paste0('"', state.name, '"="', state.abb, '"', collapse = '; ')
GSS$state.contemp <- Recode(GSS$state.contemp, recodeString, as.factor = TRUE)

# Remove every object in the workspace except the three that are needed below.
rm(list = setdiff(ls(), qw("CSLdata GSS mergeStateControlVars")))



##############################################################################
# CREATE RESPONDENT ID VARIABLES
##############################################################################
# The GSS "id" variable restarts at 1 every year. 
# Year of interview followed by the within-year id, left-padded with zeros to 
# a width of four characters.
GSS$respondentID <- with(GSS, paste0(yearInt, str_pad(id, width = 4, pad = '0')))



##############################################################################
# CREATE REGION-AT-16 VARIABLE
##############################################################################
# Region of residence at age 16, derived from stateYoung. Respondents whose 
# state is missing or is not listed below remain NA.
GSS$regionYoung <- with(GSS, {
  regionAt16 <- factor(
    x      = rep(NA, length(stateYoung)),
    levels = c('North Central', 'Northeast', 'South', 'West'))
  regionAt16[stateYoung %in% qw('CT ME MA NH NJ NY PA RI VT')] <- 'Northeast'
  regionAt16[stateYoung %in% qw('IL IN IA KS MI MN MO NE ND OH SD WI')] <- 'North Central'
  regionAt16[stateYoung %in% qw('AL AR DE DC FL GA KY LA MD MS NC OK SC TN TX VA WV')] <- 'South'
  regionAt16[stateYoung %in% qw('AK AZ CA CO HI ID MT NV NM OR UT WA WY')] <- 'West'
  regionAt16
})

# Indicator for the South, and the region-by-year14 interaction.
GSS$southYoung <- GSS$regionYoung == 'South'
GSS$ryYoung    <- GSS$regionYoung:factor(GSS$year14)



##############################################################################
# CONTROL VARIABLES
##############################################################################
# Age: the top category "89 OR OLDER" is coded as 89; "NA" and "DK" are set to
# missing. The result is not returned as a factor.
GSS$age    <- Recode(GSS$age, '"89 OR OLDER"=89; c("NA", "DK")=NA', as.factor = FALSE)
GSS$female <- GSS$sex == 'FEMALE'

# Race: drop unused levels, shorten the labels, and make "white" the 
# reference level.
GSS$race <- GSS$race %>%
  droplevels() %>%
  Recode('"WHITE"="white"; "BLACK"="black"; "OTHER"="otherRace"') %>%
  relevel('white')
GSS$black     <- GSS$race == 'black'
GSS$white     <- GSS$race == 'white'
GSS$nonwhite  <- GSS$race != 'white'
GSS$otherRace <- GSS$race == 'otherRace'

# Born in the US: "IAP", "DK", and "NA" are set to missing before comparing.
GSS$bornInUS <- Recode(GSS$born, 'c("IAP", "DK", "NA")=NA') == 'YES'

# Segregation-related measures
GSS <- GSS %>%
  mutate(
    # Black respondents who were "young" in 1958 or later in one of the listed 
    # states
    blackPostBrown = 
      race == 'black'   &
      yearYoung >= 1958 &
      stateYoung %IN% c(
        'AL', 'AR', 'DE', 'DC', 'FL', 'GA', 'KY', 'LA', 'MD', 'MS', 'NC', 'OK',
        'SC', 'TN', 'TX', 'VA', 'WV'),

    # White respondents who were "young" in Mississippi in 1957-1982
    MSDuringRepeal = 
      yearYoung %IN% 1957:1982 &
      stateYoung == 'MS'       &
      race == 'white',

    # White respondents who were "young" in South Carolina in 1956-1971
    SCDuringRepeal = 
      yearYoung %IN% 1956:1971 &
      stateYoung == 'SC'       &
      race == 'white',

    # Either of the two repeal indicators
    duringRepeal = (MSDuringRepeal | SCDuringRepeal) & race == 'white')



##############################################################################
# GENERAL OUTCOME VARIABLES
##############################################################################
# Coding more liberal options to have lower values.
#
# goveqinc: "Government in Washington ought to reduce income differences,"   
# 1-7 scale.  

# helppoor: Some people think that the government in Washington should do 
# everything possible to improve the standard of living of all poor 
# Americans; they are at Point 1 on this card. Other people think it is not 
# the government's responsibility, and that each person should take care of
# himself; they are at Point 5.
# Keep untouched copies of the raw responses before recoding.
GSS$eqwlthOriginal   <- GSS$eqwlth
GSS$goveqincOriginal <- GSS$goveqinc
GSS$eqincomeOriginal <- GSS$eqincome
GSS$helppoorOriginal <- GSS$helppoor


# eqwlth: integer scale from 1 ("GOVT REDUCE DIFF") to 7 ("NO GOVT ACTION").
# The recoded factor is converted via character so that the integers are the 
# labels rather than the factor's internal level codes.
GSS$eqwlth     <- GSS$eqwlthOriginal %>%
  Recode(., '"GOVT REDUCE DIFF"=1; "NO GOVT ACTION"=7; c(0, 8, 9, "DK", "IAP", "NA")=NA') %>%  
  as.character() %>%
  as.integer()

# goveqinc and eqincome are put on the same five-level ordered scale 
# ("CANT CHOOSE" is treated as "NEITHER" for goveqinc) and then passed to 
# merge_fac(), which presumably combines them into one factor (TODO confirm 
# against the Bullock package). unclass() leaves the integer codes.
GSS$goveqinc   <- ordered(
  x      = Recode(GSS$goveqincOriginal, 'c("IAP", "NA")=NA; "CANT CHOOSE"="NEITHER"'), 
  levels = c('STRONGLY AGREE', 'AGREE', 'NEITHER', 'DISAGREE', 'STRONGLY DISAGREE'))
GSS$eqincome <- ordered(
  x      = GSS$eqincomeOriginal, 
  levels = c('AGREE STRONGLY', 'AGREE', 'NEITHER', 'DISAGREE', 'DISAGREE STRONGLY'),
  labels = c('STRONGLY AGREE', 'AGREE', 'NEITHER', 'DISAGREE', 'STRONGLY DISAGREE'))
GSS$goveqinc <- merge_fac(
    fac.names = c('goveqinc', 'eqincome'), 
    envir     = as.environment(GSS)) %>% 
  unclass()  

# helppoor: integer scale from 1 ("GOVT ACTION") to 5 ("PEOPLE HELP SELVES").
# As with eqwlth, the recoded factor is converted via character. Calling 
# as.integer() directly on the factor would return level codes, which equal 
# the labels only if the levels happen to be exactly "1" through "5".
GSS$helppoor <- GSS$helppoorOriginal %>%
  Recode(., '"GOVT ACTION"=1; "AGREE WITH BOTH"=3; "PEOPLE HELP SELVES"=5; c(0, 8, 9, "DK", "IAP", "NA")=NA') %>%    
  as.character() %>%
  as.integer()

# welfare: 1 = 'TOO LITTLE', 2 = 'ABOUT RIGHT', 3 = 'TOO MUCH'; other 
# responses become NA.
GSS$welfare <- unclass(ordered(GSS$natfare, levels = c('TOO LITTLE', 'ABOUT RIGHT', 'TOO MUCH')))



##############################################################################
# MECHANISM VARIABLES
##############################################################################
# TRUE for 'MARRIED' respondents; the "NA" level is set to missing first.
GSS$married <- Recode(GSS$marital, '"NA" = NA') == 'MARRIED'
# Set the "DONT KNOW", "NA", and "IAP" values of an income variable to 
# missing. The result is not returned as a factor.
recodeIncome <- function (x) {
  Recode(x, 'c("DONT KNOW", "NA", "IAP")=NA', as.factor = FALSE)
}
# Log of the recoded realrinc variable
GSS$income86l <- log(recodeIncome(GSS$realrinc))

# Indicator for not being poor: 'NOT IMPUTABLE' is set to missing before the
# comparison with the two "not poor" categories.
GSS$noPov <- na_if(GSS$povline, 'NOT IMPUTABLE') %IN% c('NOT POOR', 'NOT POOR -- IMPUTED')


# OCCUPATIONAL MECHANISMS
# Convert a GSS prestige variable to integer scores.
#
# x: a prestige variable (the function name indicates that a factor is 
#    expected).
#
# Values of 0 and the "DK,NA,IAP" level are set to NA. The remaining values 
# are converted via character so that a factor yields its labels rather than 
# its internal level codes. Returns an integer vector of the same length as x.
prestigeFactorToInteger <- function (x) {
  recoded <- car::Recode(x, 'c(0, "DK,NA,IAP") = NA')
  as.integer(as.character(recoded))
}
# Respondent's prestige score and the two measures of the father's prestige
GSS$prestige                   <- prestigeFactorToInteger(GSS$prestige)
GSS$prestigeFather1970_measure <- prestigeFactorToInteger(GSS$papres16)
GSS$prestigeFather1980_measure <- prestigeFactorToInteger(GSS$papres80)

# Use the 1970 measure of the father's prestige where it is available, and the 
# 1980 measure otherwise.
GSS$prestigeFather <- ifelse(
  test = is.na(GSS$prestigeFather1970_measure),
  yes  = GSS$prestigeFather1980_measure,
  no   = GSS$prestigeFather1970_measure)

# Alesina and La Ferrara (2005, 905) use this measure
GSS$prestigeMobilityDummy <- GSS$prestige > GSS$prestigeFather


# SUBJECTIVE MEASURE OF UPWARD MOBILITY
# goodlife: The way things are in America, people like me and my family have a 
# good chance of improving our standard of living -- do you agree or disagree?
# 1 = "STRONGLY DISAGREE" ... 5 = "STRONGLY AGREE"; all other responses are NA.
GSS$SOL_gettingBetter <- GSS$goodlife %>%
  Recode(
    recodes   = '"STRONGLY DISAGREE"=1; "DISAGREE"=2; "NEITHER"=3; "AGREE"=4; "STRONGLY AGREE"=5; else=NA',
    as.factor = FALSE)



# HOW MUCH DO PEOPLE IN VARIOUS OCCUPATIONS MAKE?
# In 1987 and in 2000, the GSS asked how much people in various jobs are paid.
# Following Kris-Stella Trump, we can take the ratio between the highest- and 
# lowest-paid job as an indication of the extent of actual income inequality 
# that a respondent perceives. 
# Convert a factor of stated incomes to integers.
#
# x: a factor; anything else raises an error.
#
# Unused levels are dropped, "> 1000000" is coded as 1000000, and the listed
# non-numeric responses are set to NA. The labels are then converted to 
# integers; warnings from that conversion are suppressed, so any label that 
# remains non-numeric silently becomes NA.
recodeIncomeStatements <- function (x) {
  stopifnot('factor' %in% class(x))
  recoded <- Recode(
    droplevels(x), 
    '"> 1000000"=1000000; c("LT NOW PAID", "MT NOW PAID", "SAME AS NOW PAID", "NEVER WORKED", "DK", "NA")=NA')
  suppressWarnings(as.integer(as.character(recoded)))
}
# Create one "actualIncome<Job>" variable for each "pay*" item. The names of 
# this vector are the job suffixes; the values are the GSS variable names.
payItems <- c(
  Bricklayer         = 'paymason',
  Doctor             = 'paydoc',
  BankClerk          = 'payclerk',
  ShopOwner          = 'payowner',
  CEO                = 'payexec',
  SkilledWorker      = 'payskill',
  FarmWorker         = 'payfarm',
  Secretary          = 'paysec',
  BusDriver          = 'paybus',
  UnskilledWorker    = 'payunskl',
  CabinetMember      = 'paycabnt',
  Lawyer             = 'paylaw',
  SaleClerk          = 'paysales',
  SupCtJustice       = 'payjudge',
  SelfEmployedWorker = 'payrocc')
for (job in names(payItems)) {
  GSS[[paste0('actualIncome', job)]] <- recodeIncomeStatements(GSS[[payItems[[job]]]])
}
rm(payItems, job)

# For each respondent, divide the lowest stated income by the highest. The 
# ratio is NA for respondents who answered none of the questions.
actualIncomeQuestionNames <- grep('^actualIncome(?!Matrix)', names(GSS), value = TRUE, perl = TRUE)
GSS$actualIncomeMatrixRatio <- apply(
  X      = GSS[, actualIncomeQuestionNames],
  MARGIN = 1,
  FUN    = function (incomes) {
    reported <- incomes[!is.na(incomes)]
    if (length(reported) == 0) NA else min(reported) / max(reported)
  }
)

# HOW MUCH SHOULD PEOPLE IN VARIOUS OCCUPATIONS MAKE?
# Create one "idealIncome<Job>" variable for each "giv*" item. The names of 
# this vector are the job suffixes; the values are the GSS variable names.
givItems <- c(
  Bricklayer         = 'givmason',
  Doctor             = 'givdoc',
  BankClerk          = 'givclerk',
  ShopOwner          = 'givowner',
  CEO                = 'givexec',
  SkilledWorker      = 'givskill',
  FarmWorker         = 'givfarm',
  Secretary          = 'givsec',
  BusDriver          = 'givbus',
  UnskilledWorker    = 'givunskl',
  CabinetMember      = 'givcabnt',
  Lawyer             = 'givlaw',
  SaleClerk          = 'givsales',
  SupCtJustice       = 'givjudge',
  SelfEmployedWorker = 'givrocc')
for (job in names(givItems)) {
  GSS[[paste0('idealIncome', job)]] <- recodeIncomeStatements(GSS[[givItems[[job]]]])
}
rm(givItems, job)

# For each respondent, divide the lowest stated income by the highest. The 
# ratio is NA for respondents who answered none of the questions.
idealIncomeQuestionNames <- grep('^idealIncome(?!Matrix)', names(GSS), value = TRUE, perl = TRUE)
GSS$idealIncomeMatrixRatio <- apply(
  X      = GSS[, idealIncomeQuestionNames],
  MARGIN = 1,
  FUN    = function (incomes) {
    reported <- incomes[!is.na(incomes)]
    if (length(reported) == 0) NA else min(reported) / max(reported)
  }
)



# PSYCHOLOGICAL MECHANISMS
# The "getahead" question is "Some people say that people get ahead by their 
# own hard work; others say that lucky breaks or help from other people are 
# more important. Which do you think is most important?"
# luckOutweighsWork: 1 = 'HARD WORK', 2 = 'BOTH EQUALLY', 3 = 'LUCK OR HELP';
# other responses become NA. workOutweighsLuck reverses the scale.
GSS$luckOutweighsWork <- unclass(
  ordered(
    x      = GSS$getahead, 
    levels = c('HARD WORK', 'BOTH EQUALLY', 'LUCK OR HELP'))) 
GSS$workOutweighsLuck <- Recode(GSS$luckOutweighsWork, '1=3; 3=1')

# welfareCausesLaziness: 1 = "STRONGLY DISAGREE" ... 5 = "STRONGLY AGREE", 
# with "DK" placed at the midpoint (3).
GSS$welfareCausesLaziness <- Recode(
  var       = GSS$welfare1,
  recodes   = 'c("IAP", "NA")=NA; "STRONGLY DISAGREE"=1; "DISAGREE"=2; "DK"=3; "AGREE"=4; "STRONGLY AGREE"=5',
  as.factor = FALSE)

# wordsum: -1 and "IAP" are set to missing; 99 and "DID NOT TRY" are scored 
# as 0. If wordsum is a factor, Recode() returns a factor whose levels are 
# sorted as strings ("0", "1", "10", "2", ...), so as.integer() applied 
# directly to it would return level codes rather than scores. Converting via 
# character yields the scores themselves; any label that is still non-numeric 
# becomes NA (with a warning).
GSS$wordsum             <- Recode(
  var     = GSS$wordsum, 
  recodes = 'c(-1, "IAP")=NA; c(99, "DID NOT TRY")=0') %>%  
  as.character() %>%
  as.integer()




##############################################################################
# MERGE STATE-LEVEL CHARACTERISTICS INTO GSS CUMULATIVE FILE
############################################################################## 
# State-level characteristics: append the columns that mergeStateControlVars()
# returns for each respondent's state and year "when young."
#
# CSL instruments: merge the compulsory-schooling-law variables into the 
# result, matching on the same state and year.
GSS <- GSS %>%
  bind_cols(mergeStateControlVars(.$stateYoung, .$yearYoung)) %>%
  left_join(
    y  = CSLdata[, qw("state year CA CL work_age drop_age")],
    by = c("stateYoung" = "state", "yearYoung" = "year"))



##############################################################################
# SAVE CSL-MERGED ANALYSIS
##############################################################################
GSS %>% saveRDS(file = 'data/GSS_withMergedCSLs.RDS')